import numpy as np
print(f"NumPy version: {np.__version__}")
print(f"NumPy path: {np.__path__}")
import pandas as pd
print(f"Pandas version: {pd.__version__}")
print(f"Pandas path: {pd.__path__}")
NumPy version: 1.24.2 NumPy path: ['C:\\Users\\ddj6tu\\AppData\\Roaming\\Python\\Python311\\site-packages\\numpy']
C:\Users\ddj6tu\AppData\Roaming\Python\Python311\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed). from pandas.core import (
Pandas version: 2.2.3 Pandas path: ['C:\\Users\\ddj6tu\\AppData\\Roaming\\Python\\Python311\\site-packages\\pandas']
import seaborn as sns
import nltk
import re
from glob import glob
import matplotlib.pyplot as plt
# Collect the seven screenplay .txt files plus the sentiment lexicon CSV
# from the project data directory, sorted for a stable script_id mapping.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
source_file_list = sorted(glob("C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\*.*"))
source_file_list
['C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_AsteroidCity.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_BottleRocket.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_FrenchDispatch.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_GrandBudapestHotel.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_MoonriseKingdom.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_RoyalTennenbaums.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_Rushmore.txt', 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\salex_nrc.csv']
# Build the LIBRARY table: one row per screenplay, keyed by script_id.
# Lists are positionally aligned with the sorted file list above
# (the CSV at the end of source_file_list is intentionally excluded
# because zip stops at the shortest input, 7 items).
script_id = list(range(1, 8))
years = [2023, 1996, 2021, 2014, 2012, 2001, 1998]
# Coarse career-period labels used later as a grouping variable.
eras = ['late', 'early', 'late', 'middle', 'middle', 'early', 'early']
titles = ['Asteroid City', 'Bottle Rocket', 'French Dispatch', 'Grand Budapest Hotel', 'Moonrise Kingdom', 'Royal Tennenbaums', 'Rushmore']
LIB = pd.DataFrame(list(zip(script_id, titles, years, eras, source_file_list)),
    columns=['script_id', 'title', 'years', 'era', 'source']).set_index('script_id').sort_index()
LIB
| title | years | era | source | |
|---|---|---|---|---|
| script_id | ||||
| 1 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 2 | Bottle Rocket | 1996 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 3 | French Dispatch | 2021 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 4 | Grand Budapest Hotel | 2014 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 5 | Moonrise Kingdom | 2012 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 6 | Royal Tennenbaums | 2001 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
| 7 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... |
# Character classes for scene-heading inspection; they are not referenced
# again in the visible code — TODO confirm they can be removed.
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
# Per-script milestone regexes: a line matching the pattern marks the start
# of a new scene. Tuple layout is (script_id, regex); patterns differ per
# screenplay because each script uses different slug-line conventions.
ohco_pat_list = [
    (1, r"^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:|INSERT:|CUT TO:|MONTAGE:)"),
    (2, r"^\s*(EXT\.|INT\.|EXT/INT\.)"),
    (3, r"^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the File Room:|INT\.|Sketchbook|SPLIT-SCREEN:|Story #1|TITLE:|Story #2|INSERT:|INT/EXT\.|Split-screen:)"),
    (4, r"^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:)"),
    (5, r"^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|MONTAGE:|CUT TO:|TITLE:)"),
    (6, r"^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:)"),
    (7, r"^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARCH MONTAGE:| OCTOBER MONTAGE:|THANKSGIVING MONTAGE:|DECEMBER MONTAGE:)")
]
# Attach each script's scene regex to its LIB row by script_id.
LIB['scene_regex'] = LIB.index.map(pd.Series({x[0]: x[1] for x in ohco_pat_list}))
LIB
| title | years | era | source | scene_regex | |
|---|---|---|---|---|---|
| script_id | |||||
| 1 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... |
| 2 | Bottle Rocket | 1996 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|EXT/INT\.) |
| 3 | French Dispatch | 2021 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi... |
| 4 | Grand Budapest Hotel | 2014 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) |
| 5 | Moonrise Kingdom | 2012 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|... |
| 6 | Royal Tennenbaums | 2001 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) |
| 7 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... |
# Markers bounding the real screenplay content; TextParser._clip_lines
# discards everything outside (and just inside) these two lines.
clip_pats = [
    r'START OF SCRIPT',
    r'END OF SCRIPT'
]
import pandas as pd
import numpy as np
import nltk
class TextParser():
    """
    Parse a single Gutenberg-type text file into a TOKENS dataframe with
    an OHCO (Ordered Hierarchy of Content Objects) index. Also has methods
    to extract a VOCAB table, although vocabulary tables ought to be
    generated at the corpus level.

    Sample parameter values:

        ohco_pats = [
            ('chapter', r"^\\s*(chapter|letter)\\s+(\\d+)", 'm')
        ]
        clip_pats = [
            r'START OF GUTENBERG PROJECT',
            r'^\\s*THE END'
        ]
    """

    # TODO: Make these private
    src_imported: bool = False       # True once import_source() has run
    src_clipped: bool = False        # True once _clip_lines() has run
    src_col_suffix: str = '_str'     # content-column suffix, e.g. 'para' -> 'para_str'
    join_pat: str = r'\n'
    strip_hyphens: bool = False      # if True, replace '-' with space before tokenizing
    strip_whitespace: bool = False   # if True, tokenize on whitespace only (keeps punctuation attached)
    verbose: bool = False
    stanford_pos_model: str = "english-bidirectional-distsim.tagger"
    stanford_pos_model_path = None

    # Default trailing OHCO levels. We assume all OHCOs have sentences and
    # tokens and that these are terminal in the list; caller-supplied levels
    # are prepended in __init__.
    ohco_pats: list = [
        ('para', r"\n\n", 'd'),
        ('sent', r"[.?!;:]+", 'd'),
        ('token', r"[\s',-]+", 'd')
    ]

    # Maps parse type ('d' = delimiter split, 'm' = milestone line) to the
    # suffix used for the corresponding index-level name.
    _ohco_type: dict = {
        'd': '_num',
        'm': '_id'
    }

    def __init__(self, src_file: str, ohco_pats: list, clip_pats: list, use_nltk=True):
        """Initialize the object and extract config info. If using NLTK, download resources."""
        self.src_file = src_file
        self.clip_pats = clip_pats                    # TODO: Validate
        self.ohco_pats = ohco_pats + self.ohco_pats   # TODO: Validate (new list; class default untouched)
        self.OHCO = [item[0] + self._ohco_type[item[2]] for item in self.ohco_pats]
        self.ohco_names = [item[0] for item in self.ohco_pats]
        self.use_nltk = use_nltk
        if self.use_nltk:
            # Override the last two OHCO items: NLTK models replace the
            # regex-based sentence and token delimiters.
            self.ohco_pats[-2] = ('sent', None, 'nltk')
            self.ohco_pats[-1] = ('token', None, 'nltk')
            # Make sure the required NLTK resources are available.
            for package in [
                'tokenizers/punkt',
                'taggers/averaged_perceptron_tagger',
                'corpora/stopwords',
                'help/tagsets'
            ]:
                if self.verbose:
                    print("Checking", package)
                try:
                    nltk.data.find(package)
                except LookupError:
                    # BUGFIX: nltk.data.find() raises LookupError, not
                    # IndexError, so the original except clause never fired.
                    # Also, nltk.download() takes the bare package id
                    # (e.g. 'punkt'), not the 'tokenizers/punkt' path.
                    nltk.download(package.split('/')[-1])

    def import_source(self, strip: bool = True, char_encoding: str = "utf-8-sig"):
        """Convert a raw text file into a dataframe of lines, then clip cruft.

        Returns self so calls can be chained, e.g. .import_source().parse_tokens().
        """
        if self.verbose:
            print("Importing ", self.src_file)
        # Context manager guarantees the file handle is closed (the original
        # left it open).
        with open(self.src_file, 'r', encoding=char_encoding) as f:
            text_lines = f.readlines()
        self.LINES = pd.DataFrame({'line_str': text_lines})
        self.LINES.index.name = 'line_id'
        if strip:
            self.LINES.line_str = self.LINES.line_str.str.strip()
        self.src_imported = True
        if self.verbose:
            print("Clipping text")
        self._clip_lines()
        return self

    def _clip_lines(self):
        """Remove cruft lines from beginning and/or end of file.

        Raises ValueError if either clip pattern is missing from the text.
        """
        start_pat = self.clip_pats[0]
        end_pat = self.clip_pats[1]
        start = self.LINES.line_str.str.contains(start_pat, regex=True)
        end = self.LINES.line_str.str.contains(end_pat, regex=True)
        try:
            start_line_num = self.LINES.loc[start].index[0]
        except IndexError:
            raise ValueError("Clip start pattern not found.")
        try:
            end_line_num = self.LINES.loc[end].index[0]
        except IndexError:
            raise ValueError("Clip end pattern not found.")
        # .loc label slicing is inclusive: keep lines strictly after the start
        # marker up to two lines before the end marker.
        self.LINES = self.LINES.loc[start_line_num + 1: end_line_num - 2]
        # BUGFIX: the original wrote 'self.src_clipped == True' — a no-op
        # comparison — so the flag was never actually set.
        self.src_clipped = True

    def parse_tokens(self):
        """Convert lines to tokens based on OHCO.

        Builds self.TOKENS by walking each OHCO level, splitting the text by
        milestone ('m'), delimiter ('d'), or NLTK models ('nltk').
        Raises RuntimeError if import_source() has not been called.
        """
        if not self.src_imported:
            raise RuntimeError("Source not imported. Please run .import_source()")
        # Start with the LINES df.
        self.TOKENS = self.LINES.copy()
        # Walk through each level of the OHCO to build out TOKENS.
        for i, level in enumerate(self.OHCO):
            if self.verbose:
                print(f"Parsing OHCO level {i} {level}", end=' ')
            # Level-specific configuration.
            parse_type = self.ohco_pats[i][2]
            div_name = self.ohco_pats[i][0]
            div_pat = self.ohco_pats[i][1]
            # The source column is the previous level's content (or the raw lines).
            if i == 0:
                src_div_name = 'line'
            else:
                src_div_name = self.ohco_names[i - 1]
            src_col = f"{src_div_name}{self.src_col_suffix}"
            dst_col = f"{div_name}{self.src_col_suffix}"

            # By Milestone: lines matching the pattern delimit divisions.
            if parse_type == 'm':
                if self.verbose:
                    print(f"by milestone {div_pat}")
                div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
                # Number the milestone lines 1..K, forward-fill onto content lines.
                self.TOKENS.loc[div_lines, div_name] = [j + 1 for j in range(self.TOKENS.loc[div_lines].shape[0])]
                self.TOKENS[div_name] = self.TOKENS[div_name].ffill()
                # Drop lines before the first milestone, and the milestone lines themselves.
                self.TOKENS = self.TOKENS.loc[~self.TOKENS[div_name].isna()]
                self.TOKENS = self.TOKENS.loc[~div_lines]
                self.TOKENS[div_name] = self.TOKENS[div_name].astype('int')
                # Collapse each division's lines into one newline-joined string.
                self.TOKENS = self.TOKENS.groupby(self.ohco_names[:i + 1], group_keys=True)[src_col]\
                    .apply(lambda x: '\n'.join(x)).to_frame(dst_col)
                # (Removed leftover debug prints of src_col/dst_col/columns.)

            # By Delimiter: split the source column on a regex.
            elif parse_type == 'd':
                if self.verbose:
                    print(f"by delimiter {div_pat}")
                self.TOKENS = self.TOKENS[src_col].str.split(div_pat, expand=True).stack().to_frame(dst_col)

            # By NLTK: sentence and token models replace the regex delimiters.
            elif parse_type == 'nltk':
                if self.verbose:
                    print("by NLTK model")
                if level == 'sent_num':
                    self.TOKENS = self.TOKENS.para_str\
                        .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype='string'))\
                        .stack()\
                        .to_frame('sent_str')
                if level == 'token_num':
                    if self.strip_hyphens:
                        self.TOKENS.sent_str = self.TOKENS.sent_str.str.replace(r"-", ' ')
                    if self.strip_whitespace:
                        self.TOKENS = self.TOKENS.sent_str\
                            .apply(lambda x: pd.Series(
                                nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)),
                                dtype='object'
                            ))
                    else:
                        self.TOKENS = self.TOKENS.sent_str\
                            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))
                    # Each cell is a (token, POS) tuple; unpack into columns.
                    self.TOKENS = self.TOKENS.stack().to_frame('pos_tuple')
                    self.TOKENS['pos'] = self.TOKENS.pos_tuple.apply(lambda x: x[1])
                    self.TOKENS['token_str'] = self.TOKENS.pos_tuple.apply(lambda x: x[0])
                    self.TOKENS['term_str'] = self.TOKENS.token_str.str.lower()
            else:
                raise ValueError(f"Invalid parse option: {parse_type}.")

            # Name the index levels created so far.
            self.TOKENS.index.names = self.OHCO[:i + 1]

        # After iterating through the OHCO, normalize terms.
        if not self.use_nltk:
            self.TOKENS['term_str'] = self.TOKENS.token_str.str.replace(r'[\W_]+', '', regex=True).str.lower()
        else:
            # Drop POS tags that are pure punctuation before normalizing.
            punc_pos = ['$', "''", '(', ')', ',', '--', '.', ':', '``']
            self.TOKENS['term_str'] = self.TOKENS[~self.TOKENS.pos.isin(punc_pos)].token_str\
                .str.replace(r'[\W_]+', '', regex=True).str.lower()

    def extract_vocab(self):
        """Build a VOCAB table from TOKENS. This should also be done at the corpus level.

        Adds frequency (n, p), surprisal (s, i) and entropy contribution (h)
        per term; stores total entropy in self.H. Returns self.
        """
        self.VOCAB = self.TOKENS.term_str.value_counts().to_frame('n')
        self.VOCAB.index.name = 'term_str'
        self.VOCAB['n_chars'] = self.VOCAB.index.str.len()
        self.VOCAB['p'] = self.VOCAB['n'] / self.VOCAB['n'].sum()
        self.VOCAB['s'] = 1 / self.VOCAB['p']
        self.VOCAB['i'] = np.log2(self.VOCAB['s'])  # Same as negative log probability (i.e. log likelihood)
        self.VOCAB['h'] = self.VOCAB['p'] * self.VOCAB['i']
        self.H = self.VOCAB['h'].sum()
        return self

    def annotate_vocab(self):
        """This should be done at the corpus level."""
        # Stopwords
        # Max POS
        # POS variability
        # Porter Stems
        pass

    def extract_pos_data(self):
        # TODO: Create dataframe for POS info, including Penn Treebank info
        pass

    def extract_named_entities(self):
        # TODO: Create dataframe of named entities
        pass

    def gather_tokens(self, level=0, grouping_col='term_str', cat_sep=' '):
        """Gather tokens into strings for an arbitrary OHCO level.

        Returns a dataframe with one '<level>_str' column indexed by the
        OHCO levels up to and including `level`.
        Raises ValueError for levels at or below the token level.
        """
        max_level = len(self.OHCO) - 2  # Can't gather tokens at the token level :)
        if level > max_level:
            raise ValueError(f"Level {level} too high. Try between 0 and {max_level}")
        level_name = self.OHCO[level].split('_')[0]
        idx = self.TOKENS.index.names[:level + 1]
        return self.TOKENS.groupby(idx)[grouping_col].apply(lambda x: x.str.cat(sep=cat_sep))\
            .to_frame(f'{level_name}_str')
# Script entry point: intentionally empty — this module is used via import
# (or notebook execution), not run directly.
if __name__ == '__main__':
    pass
def tokenize_collection(LIB):
    """Tokenize every screenplay listed in LIB and combine into one CORPUS.

    For each row of LIB, builds a TextParser using that script's scene
    milestone regex plus the module-level clip_pats, parses it into a
    TOKENS table, prefixes the index with script_id, and concatenates
    everything into a single sorted dataframe.

    Parameters:
        LIB: DataFrame indexed by script_id with 'title', 'scene_regex'
             and 'source' columns.
    Returns:
        DataFrame of tokens indexed by script_id + the parser's OHCO.
    Raises:
        ValueError if LIB has no rows (the original crashed with a
        NameError on 'del(text)' and a failing pd.concat in that case).
    """
    if len(LIB.index) == 0:
        raise ValueError("LIB is empty; nothing to tokenize.")
    books = []
    for script_id in LIB.index:
        # Announce
        print("Tokenizing", script_id, LIB.loc[script_id].title)
        # Per-script configuration
        scene_regex = LIB.loc[script_id].scene_regex
        ohco_pats = [('scene', scene_regex, 'm')]
        src_file_path = LIB.loc[script_id].source
        # Create and configure the parser
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True
        # Parse
        text.import_source().parse_tokens()
        # Prefix the OHCO index with the script id
        text.TOKENS['script_id'] = script_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['script_id'] + text.OHCO)
        books.append(text.TOKENS)
    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()
    print("Done")
    return CORPUS
CORPUS = tokenize_collection(LIB)
Tokenizing 1 Asteroid City Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_AsteroidCity.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:|INSERT:|CUT TO:|MONTAGE:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 2 Bottle Rocket Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_BottleRocket.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(EXT\.|INT\.|EXT/INT\.) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 3 French Dispatch Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_FrenchDispatch.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the File Room:|INT\.|Sketchbook|SPLIT-SCREEN:|Story #1|TITLE:|Story #2|INSERT:|INT/EXT\.|Split-screen:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 4 Grand Budapest Hotel Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_GrandBudapestHotel.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 5 Moonrise Kingdom Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_MoonriseKingdom.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|MONTAGE:|CUT TO:|TITLE:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 6 Royal Tennenbaums Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_RoyalTennenbaums.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Tokenizing 7 Rushmore Importing C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_Rushmore.txt Clipping text Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARCH MONTAGE:| OCTOBER MONTAGE:|THANKSGIVING MONTAGE:|DECEMBER MONTAGE:) line_str scene_str Index(['scene_str'], dtype='object') Parsing OHCO level 1 para_num by delimitter \n\n Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model Done
CORPUS
| pos_tuple | pos | token_str | term_str | |||||
|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | token_num | ||||
| 1 | 1 | 0 | 0 | 0 | (Black, NNP) | NNP | Black | black |
| 1 | (and, CC) | CC | and | and | ||||
| 2 | (white., NN) | NN | white. | white | ||||
| 1 | 0 | 0 | (A, DT) | DT | A | a | ||
| 1 | (1950's, CD) | CD | 1950's | 1950s | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 134 | 2 | 15 | (everyone, NN) | NN | everyone | everyone |
| 16 | (slowly, RB) | RB | slowly | slowly | ||||
| 17 | (begins, VBZ) | VBZ | begins | begins | ||||
| 18 | (to, TO) | TO | to | to | ||||
| 19 | (dance., VB) | VB | dance. | dance |
177116 rows × 4 columns
nltk.download('averaged_perceptron_tagger_eng')
[nltk_data] Downloading package averaged_perceptron_tagger_eng to [nltk_data] C:\Users\ddj6tu\AppData\Roaming\nltk_data... [nltk_data] Package averaged_perceptron_tagger_eng is already up-to- [nltk_data] date!
True
# Per-script token count appended to the metadata table.
LIB['movie_len'] = CORPUS.groupby('script_id').term_str.count()
# Distinct scene count per script (drop_duplicates + count is equivalent
# to nunique on scene_id here).
LIB['n_scenes'] = CORPUS.reset_index()[['script_id', 'scene_id']]\
    .drop_duplicates()\
    .groupby('script_id').scene_id.count()
LIB
| title | years | era | source | scene_regex | movie_len | n_scenes | |
|---|---|---|---|---|---|---|---|
| script_id | |||||||
| 1 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 |
| 2 | Bottle Rocket | 1996 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|EXT/INT\.) | 19464 | 94 |
| 3 | French Dispatch | 2021 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi... | 30868 | 182 |
| 4 | Grand Budapest Hotel | 2014 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) | 27185 | 177 |
| 5 | Moonrise Kingdom | 2012 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|... | 24877 | 138 |
| 6 | Royal Tennenbaums | 2001 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) | 24939 | 222 |
| 7 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 |
LIB['movie_len'].mean()
25213.14285714286
# Drop tokens that normalized to the empty string (pure punctuation) and
# rows with an empty POS tag, then add a coarse POS group taken from the
# first two characters of the Penn Treebank tag (e.g. VBZ -> VB).
CORPUS = CORPUS[CORPUS.term_str != '']
CORPUS = CORPUS[CORPUS.pos != '']
CORPUS['pos_group'] = CORPUS.pos.str[:2]
CORPUS
| pos_tuple | pos | token_str | term_str | pos_group | |||||
|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | token_num | |||||
| 1 | 1 | 0 | 0 | 0 | (Black, NNP) | NNP | Black | black | NN |
| 1 | (and, CC) | CC | and | and | CC | ||||
| 2 | (white., NN) | NN | white. | white | NN | ||||
| 1 | 0 | 0 | (A, DT) | DT | A | a | DT | ||
| 1 | (1950's, CD) | CD | 1950's | 1950s | CD | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 134 | 2 | 15 | (everyone, NN) | NN | everyone | everyone | NN |
| 16 | (slowly, RB) | RB | slowly | slowly | RB | ||||
| 17 | (begins, VBZ) | VBZ | begins | begins | VB | ||||
| 18 | (to, TO) | TO | to | to | TO | ||||
| 19 | (dance., VB) | VB | dance. | dance | VB |
177087 rows × 5 columns
# Corpus-level vocabulary table keyed by normalized term.
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
# Relative frequency and information content (negative log2 probability).
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
# Most frequent POS tag / coarse POS group observed for each term.
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str', 'pos_group']].value_counts().unstack(fill_value=0).idxmax(1)
# Number of distinct POS tags, and the full tag set, per term.
VOCAB['n_pos'] = CORPUS[['term_str', 'pos']].value_counts().unstack().count(1)
VOCAB['cat_pos'] = CORPUS[['term_str', 'pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))
# Flag NLTK English stopwords via a dummy-column mapping.
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
# Add three alternative stemmings of each term for later comparison.
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)
from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)
from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)
VOCAB
| n | n_chars | p | i | max_pos | max_pos_group | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||
| 1 | 41 | 1 | 0.000232 | 12.071454 | NNP | NN | 4 | {CD, PDT, NN, NNP} | 0 | 1 | 1 | 1 |
| 10 | 12 | 2 | 0.000068 | 13.844044 | CD | CD | 4 | {CD, VBZ, NN, NNP} | 0 | 10 | 10 | 10 |
| 100 | 8 | 3 | 0.000045 | 14.429006 | CD | CD | 1 | {CD} | 0 | 100 | 100 | 100 |
| 10000 | 1 | 5 | 0.000006 | 17.429006 | CD | CD | 1 | {CD} | 0 | 10000 | 10000 | 10000 |
| 100111 | 1 | 6 | 0.000006 | 17.429006 | CD | CD | 1 | {CD} | 0 | 100111 | 100111 | 100111 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| zubrowkian | 3 | 10 | 0.000017 | 15.844044 | JJ | JJ | 2 | {JJ, NNP} | 0 | zubrowkian | zubrowkian | zubrowk |
| à | 3 | 1 | 0.000017 | 15.844044 | NN | NN | 2 | {NN, NNP} | 0 | à | à | à |
| éclair | 1 | 6 | 0.000006 | 17.429006 | NNP | NN | 1 | {NNP} | 0 | éclair | éclair | éclair |
| éclairs | 1 | 7 | 0.000006 | 17.429006 | NNS | NN | 1 | {NNS} | 0 | éclair | éclair | éclairs |
| école | 1 | 5 | 0.000006 | 17.429006 | NNP | NN | 1 | {NNP} | 0 | école | école | écol |
15306 rows × 12 columns
# Rebuild the stopword flag (idempotent: the join is skipped when 'stop'
# already exists from the earlier annotation pass), then drop stopwords.
sw = pd.DataFrame({'stop': 1}, index=nltk.corpus.stopwords.words('english'))
sw.index.name = 'term_str'
if 'stop' not in VOCAB.columns:
    VOCAB = VOCAB.join(sw)
    VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
# BUGFIX: take an explicit copy of the filtered slice. Without .copy(),
# the later VOCAB['df'/'idf'/'dfidf'] assignments operate on a slice and
# raise SettingWithCopyWarning (visible in the notebook output).
VOCAB = VOCAB[VOCAB.stop == 0].copy()
# Canonical OHCO index levels and convenient prefixes ("bags") for grouping.
OHCO = ['script_id', 'scene_id', 'para_num', 'sent_num', 'token_num']
PARA = OHCO[:3]
CHAP = OHCO[:2]
BOOK = OHCO[:1]
SENT = OHCO[:4]
# Analysis configuration: max-normalized term frequency, sentence-level
# bags, DF-IDF vocabulary filter keeping the top 1000 terms.
tf_method = 'max'
bag = SENT
vocab_filter = 'dfidf'
n_terms = 1000
def create_bow(CORPUS, bag, item_type='term_str'):
    """Return a bag-of-words table: counts of each item per bag.

    Groups CORPUS by the given OHCO prefix plus the item column and counts
    occurrences, yielding a single 'n' column indexed by bag + item.
    """
    grouped = CORPUS.groupby(bag + [item_type])[item_type]
    counts = grouped.count()
    return counts.to_frame('n')
# Bag-of-words at the sentence level (one bag per sentence).
BOW = create_bow(CORPUS, SENT)
BOW
| n | |||||
|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | term_str | |
| 1 | 1 | 0 | 0 | and | 1 |
| black | 1 | ||||
| white | 1 | ||||
| 1 | 0 | 1950s | 1 | ||
| a | 1 | ||||
| ... | ... | ... | ... | ... | ... |
| 7 | 118 | 134 | 2 | takes | 1 |
| the | 2 | ||||
| to | 1 | ||||
| walks | 1 | ||||
| with | 1 |
167833 rows × 1 columns
# Document-term matrix: one row per sentence bag, one column per term,
# with zeros for absent terms.
DTM = BOW.n.unstack(fill_value=0)
DTM
| term_str | 1 | 10 | 100 | 10000 | 100111 | 101 | 10111 | 101111 | 102 | 102111 | ... | zoom | zooms | zs | zubrowka | zubrowkaofficially | zubrowkian | à | éclair | éclairs | école | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | |||||||||||||||||||||
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||
| 134 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
23483 rows × 15306 columns
# Document frequency, inverse document frequency, and DF-IDF per term.
# NOTE(review): these assignments trigger SettingWithCopyWarning (see the
# notebook output) because VOCAB was produced by a boolean-mask slice
# earlier; slicing with .copy() there would silence it.
VOCAB['df'] = DTM.astype('bool').sum()
VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['df'] = DTM.astype('bool').sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf
VOCAB
| n | n_chars | p | i | max_pos | max_pos_group | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||
| 1 | 41 | 1 | 0.000232 | 12.071454 | NNP | NN | 4 | {CD, PDT, NN, NNP} | 0 | 1 | 1 | 1 | 40 | 9.197401 | 367.896040 |
| 10 | 12 | 2 | 0.000068 | 13.844044 | CD | CD | 4 | {CD, VBZ, NN, NNP} | 0 | 10 | 10 | 10 | 12 | 10.934367 | 131.212399 |
| 100 | 8 | 3 | 0.000045 | 14.429006 | CD | CD | 1 | {CD} | 0 | 100 | 100 | 100 | 8 | 11.519329 | 92.154633 |
| 10000 | 1 | 5 | 0.000006 | 17.429006 | CD | CD | 1 | {CD} | 0 | 10000 | 10000 | 10000 | 1 | 14.519329 | 14.519329 |
| 100111 | 1 | 6 | 0.000006 | 17.429006 | CD | CD | 1 | {CD} | 0 | 100111 | 100111 | 100111 | 1 | 14.519329 | 14.519329 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| zubrowkian | 3 | 10 | 0.000017 | 15.844044 | JJ | JJ | 2 | {JJ, NNP} | 0 | zubrowkian | zubrowkian | zubrowk | 3 | 12.934367 | 38.803100 |
| à | 3 | 1 | 0.000017 | 15.844044 | NN | NN | 2 | {NN, NNP} | 0 | à | à | à | 3 | 12.934367 | 38.803100 |
| éclair | 1 | 6 | 0.000006 | 17.429006 | NNP | NN | 1 | {NNP} | 0 | éclair | éclair | éclair | 1 | 14.519329 | 14.519329 |
| éclairs | 1 | 7 | 0.000006 | 17.429006 | NNS | NN | 1 | {NNS} | 0 | éclair | éclair | éclairs | 1 | 14.519329 | 14.519329 |
| école | 1 | 5 | 0.000006 | 17.429006 | NNP | NN | 1 | {NNP} | 0 | école | école | écol | 1 | 14.519329 | 14.519329 |
15173 rows × 15 columns
VOCAB.dfidf.sort_values(ascending=False).head(20)
term_str max 4283.756574 mr 4056.413705 looks 3958.476797 dignan 3914.095042 anthony 3386.125779 royal 3151.175932 gustave 3014.477170 back 2870.570211 one 2689.799238 says 2656.320299 dont 2481.588591 sam 2438.135107 bob 2345.754584 vo 2278.796777 im 2238.218371 blume 2169.891364 pause 2156.118937 zero 2137.699071 suzy 2086.704994 door 2049.298688 Name: dfidf, dtype: float64
# TF-IDF with max-normalized term frequency per document (tf_method='max').
TFIDF = (DTM.T / DTM.T.max()).T * VOCAB.idf
# Select the top n_terms noun-like terms (singular/plural common nouns)
# ranked by DF-IDF.
VIDX = VOCAB[VOCAB.max_pos.isin(['NN', 'NNS'])].sort_values('dfidf', ascending=False).head(n_terms).index
VOCAB.loc[VIDX].sort_index()
| n | n_chars | p | i | max_pos | max_pos_group | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||
| 2 | 49 | 1 | 0.000278 | 11.814296 | NN | NN | 5 | {NN, NNP, CD, VBP, CC} | 0 | 2 | 2 | 2 | 49 | 8.904619 | 436.326344 |
| aback | 8 | 5 | 0.000045 | 14.429006 | NN | NN | 3 | {JJ, NN, NNP} | 0 | aback | aback | aback | 8 | 11.519329 | 92.154633 |
| accent | 12 | 6 | 0.000068 | 13.844044 | NN | NN | 2 | {VBZ, NN} | 0 | accent | accent | acc | 12 | 10.934367 | 131.212399 |
| actor | 55 | 5 | 0.000312 | 11.647646 | NN | NN | 3 | {JJ, NN, NNP} | 0 | actor | actor | act | 50 | 8.875473 | 443.773646 |
| actors | 10 | 6 | 0.000057 | 14.107078 | NNS | NN | 3 | {NNS, NN, JJ} | 0 | actor | actor | act | 10 | 11.197401 | 111.974010 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| yards | 13 | 5 | 0.000074 | 13.728566 | NNS | NN | 2 | {NNS, NN} | 0 | yard | yard | yard | 13 | 10.818889 | 140.645562 |
| year | 81 | 4 | 0.000459 | 11.089156 | NN | NN | 3 | {CD, NN, JJ} | 0 | year | year | year | 77 | 8.252543 | 635.445778 |
| years | 73 | 5 | 0.000414 | 11.239182 | NNS | NN | 3 | {NNS, NN, JJ} | 0 | year | year | year | 73 | 8.329505 | 608.053832 |
| yet | 33 | 3 | 0.000187 | 12.384612 | NN | NN | 7 | {NNS, NN, NNP, VBN, VB, RB, CC} | 0 | yet | yet | yet | 32 | 9.519329 | 304.618531 |
| yo | 16 | 2 | 0.000091 | 13.429006 | NN | NN | 2 | {NN, NNP} | 0 | yo | yo | yo | 9 | 11.349404 | 102.144637 |
1000 rows × 15 columns
# Reduce the TFIDF matrix to the selected top-noun vocabulary.
TFIDF_RED = TFIDF[VIDX]
TFIDF_RED
| term_str | vo | pause | door | room | nods | hand | front | hands | man | right | ... | text | shoots | studio | warning | aback | coldly | pipes | candle | observatory | pressure | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | |||||||||||||||||||||
| 1 | 1 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |||
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |||
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 11.519329 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 133 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |||
| 134 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ||
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.48041 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |||
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
23483 rows × 1000 columns
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns
# Project the TFIDF document-term matrix onto its first n_comps principal
# components, yielding the document-component matrix (DCM).
n_comps = 5
pc_cols = [f"PC{i}" for i in range(n_comps)]
pca_engine = PCA(n_components=n_comps)
DCM = pd.DataFrame(pca_engine.fit_transform(TFIDF.fillna(0)),
                   index=TFIDF.index, columns=pc_cols)
# Attach library metadata and build a per-document display label
# "<title>-<scene_id>" (scene_id is the second level of the row index).
DCM = DCM.join(LIB, on='script_id')
DCM['doc'] = DCM.apply(lambda row: f"{row.title}-{row.name[1]}", axis=1)
# Term loadings: components scaled by the square root of explained variance.
LOADINGS = pd.DataFrame(pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_),
                        index=TFIDF.columns, columns=pc_cols)
LOADINGS.index.name = 'term_str'
LOADINGS = LOADINGS.join(VOCAB)
def vis_pcs(a=0, b=1, label='title', hover_name='doc', symbol=None, size=None):
    """Scatter DCM documents on principal components a and b.

    Points are colored by `label` (a LIB metadata column) and carry a box
    marginal on the x axis.
    """
    x_col, y_col = f"PC{a}", f"PC{b}"
    return px.scatter(DCM, x_col, y_col,
                      color=label,
                      hover_name=hover_name,
                      symbol=symbol,
                      size=size,
                      marginal_x='box',
                      height=800)
vis_pcs(0, 1)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning. sf: grouped.get_group(s if len(s) > 1 else s[0])
# Documents plotted on the next pair of components.
vis_pcs(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# NOTE(review): duplicate of the call above — looks like a re-executed cell.
vis_pcs(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
def vis_loadings(a=0, b=1, hover_name='term_str'):
    """Scatter term loadings on principal components a and b.

    Marker size is the term's DFIDF weight; color is its dominant POS group.
    """
    data = LOADINGS.reset_index()
    return px.scatter(data, f"PC{a}", f"PC{b}",
                      text='term_str',
                      hover_name='term_str',
                      size='dfidf',
                      color='max_pos_group',
                      marginal_x='box',
                      height=800)
vis_loadings(0, 1)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Term loadings on the next pair of components.
vis_loadings(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Build a table of the 10 strongest terms at each pole of every component.
top_terms_sk = {}  # NOTE(review): never used below — kept as-is for compatibility
data = []
for i in range(n_comps):
    for pole in (0, 1):
        # pole 0 sorts descending (largest loadings first); pole 1 ascending.
        ranked = LOADINGS.sort_values(f'PC{i}', ascending=bool(pole)).head(10)
        data.append((f"PC{i}", pole, ' '.join(ranked.index.to_list())))
comp_strs = pd.DataFrame(data, columns=['pc', 'pole', 'top_terms'])
comp_strs = comp_strs.set_index(['pc', 'pole'])
COMPS = comp_strs
COMPS
| top_terms | ||
|---|---|---|
| pc | pole | |
| PC0 | 0 | scout master ward captain sharp bishop mr mrs ... |
| 1 | dignan anthony bob max royal cross miss dont k... | |
| PC1 | 0 | mr blume bishop henry moustafa max mrs fischer... |
| 1 | dignan scout master ward anthony dont bob roya... | |
| PC2 | 0 | dignan anthony bob mr dont know henry blume ge... |
| 1 | miss cross max gustave zero says sam eyes sits... | |
| PC3 | 0 | miss cross dignan anthony bob max looks master... |
| 1 | vo gustave captain sharp royal wright roebuck ... | |
| PC4 | 0 | max looks back royal anthony richie around cha... |
| 1 | vo cross miss wright roebuck dont captain shar... |
# Sentence-level documents for topic modeling: keep only common nouns
# (POS tags NN / NNS) and join each sentence's terms into one string.
DOCS = (
    CORPUS[CORPUS.pos.str.match(r'^NNS?$')]
    .groupby(SENT)['term_str']
    .agg(' '.join)
    .to_frame('doc_str')
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px
# LDA topic-model hyperparameters.
ngram_range = (1, 2)   # unigrams and bigrams
n_terms = 4000         # vocabulary cap for the count vectorizer
n_topics = 20
max_iter = 5
n_top_terms = 9        # terms shown per topic in the TOPICS table
count_engine = CountVectorizer(max_features=n_terms, ngram_range=ngram_range, stop_words='english')
count_model = count_engine.fit_transform(DOCS.doc_str)
TERMS = count_engine.get_feature_names_out()
VOCAB2 = pd.DataFrame(index=TERMS)
VOCAB2.index.name = 'term_str'
DTM2 = pd.DataFrame(count_model.toarray(), index=DOCS.index, columns=TERMS)
# Number of documents each term appears in (bool -> int -> column sum).
VOCAB2['doc_count'] = DTM2.astype('bool').astype('int').sum()
DOCS['term_count'] = DTM2.sum(1)
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
# Zero-padded topic labels T00..T19.
TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
lda_model = lda_engine.fit_transform(count_model)
# Document-topic matrix. FIX: the original set THETA.columns.name BEFORE
# replacing THETA.columns with TNAMES, which discarded the name; passing the
# columns at construction and naming the axis afterwards keeps 'topic_id'.
THETA = pd.DataFrame(lda_model, index=DOCS.index, columns=TNAMES)
THETA.columns.name = 'topic_id'
# Topic-term matrix.
PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
PHI.index.name = 'topic_id'
PHI.columns.name = 'term_str'
THETA
| T00 | T01 | T02 | T03 | T04 | T05 | T06 | T07 | T08 | T09 | T10 | T11 | T12 | T13 | T14 | T15 | T16 | T17 | T18 | T19 | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | ||||||||||||||||||||
| 1 | 1 | 0 | 0 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 1 | 0 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.455268 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.244732 | 0.016667 | 0.016667 | ||
| 1 | 0.016667 | 0.016667 | 0.016667 | 0.683333 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | |||
| 2 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | |||
| 3 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.810000 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 132 | 2 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.525000 |
| 133 | 0 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ||
| 1 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.683333 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | |||
| 134 | 1 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ||
| 2 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.207715 | 0.612285 |
18938 rows × 20 columns
def _topic_top_terms(weights):
    """Space-joined top n_top_terms terms for one topic, highest weight first."""
    ranked = weights.sort_values(ascending=False).head(n_top_terms)
    return ' '.join(ranked.reset_index().term_str)

# One row per topic: its strongest terms as a single string.
TOPICS = PHI.stack().groupby('topic_id').apply(_topic_top_terms).to_frame('top_terms')
TOPICS
| top_terms | |
|---|---|
| topic_id | |
| T00 | eyes smiles end telephone pocket guy hes mitch... |
| T01 | door minute street manager play mouth motel wi... |
| T02 | day girls troop rest record looks note express... |
| T03 | way years box water author children arm father... |
| T04 | face ground year group desk life place actor s... |
| T05 | book girl hair watch eye office radio cover case |
| T06 | hands car man glass frowns doors pair elevator... |
| T07 | look sir mother bed sound jacket set coffee ma... |
| T08 | school quietly thing men booth mirror seat she... |
| T09 | silence page house sorry bathroom good home sa... |
| T10 | room shrugs bottle away shirt glasses waiter t... |
| T11 | hand time table woman middle sky business job ... |
| T12 | pause nods scout course feet stares margaret b... |
| T13 | right floor morning roof night light fingers c... |
| T14 | window head cigarette station picture point ha... |
| T15 | people lights youre building alien second scou... |
| T16 | boy points tent wall doorway work paper room dont |
| T17 | max vo air space family dirk huh friend breath |
| T18 | zero hesitates walks sidewalk guard distance m... |
| T19 | voice corner kind arms moment yeah foot sighs lot |
# Topic weights joined with library metadata at the sentence level.
THE_LIB = THETA.join(LIB)
# Reduce the topic-term distributions (rows of PHI) to two dimensions for plotting.
pca = PCA(n_components=2)
pc = pca.fit_transform(PHI)
pca_df = pd.DataFrame(pc, index=PHI.index, columns=['PC1', 'PC2'])
# Average weight of each topic across all documents; used for marker size below.
mean_topic_weight = THETA.mean(axis=0).rename('MeanWeight')
topic_info = pca_df.join(mean_topic_weight)
def get_topic_title(topic_name, the_lib, title_column='title'):
    """Return the title of the row with the highest weight in *topic_name*.

    Returns None when *the_lib* is empty.
    """
    best = the_lib.nlargest(1, topic_name)
    if best.empty:
        return None
    return best[title_column].iloc[0]
# Display the topic-weights table joined with library metadata.
THE_LIB
| T00 | T01 | T02 | T03 | T04 | T05 | T06 | T07 | T08 | T09 | ... | T17 | T18 | T19 | title | years | era | source | scene_regex | movie_len | n_scenes | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | |||||||||||||||||||||
| 1 | 1 | 0 | 0 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 |
| 1 | 0 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.455268 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | ... | 0.244732 | 0.016667 | 0.016667 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 | ||
| 1 | 0.016667 | 0.016667 | 0.016667 | 0.683333 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | ... | 0.016667 | 0.016667 | 0.016667 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 | |||
| 2 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.525000 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 | |||
| 3 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | ... | 0.010000 | 0.010000 | 0.810000 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 132 | 2 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.525000 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 |
| 133 | 0 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 | ||
| 1 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | 0.016667 | ... | 0.016667 | 0.016667 | 0.016667 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 | |||
| 134 | 1 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 | ||
| 2 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | 0.010000 | ... | 0.010000 | 0.207715 | 0.612285 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 |
18938 rows × 27 columns
# Label every topic with the title of the document where it peaks.
# IDIOM FIX: dict comprehension replaces the manual accumulation loop.
topic_titles = {topic: get_topic_title(topic, THE_LIB) for topic in PHI.index}
topic_title_series = pd.Series(topic_titles, name='title')
topic_df_vis = topic_info.join(topic_title_series)
topic_df_vis = topic_df_vis.dropna(subset=['title'])
# NOTE(review): this redefines vis_loadings() — the term-loading version
# defined earlier in the notebook is shadowed from here on; its plots were
# already rendered, so nothing visible breaks, but consider a distinct name.
def vis_loadings(a=1, b=2, hover_name=topic_df_vis.index.name or 'topic'):
    """Scatter the topics in 2-D PCA space.

    Marker size is the topic's mean weight across documents; color is the
    title of the document where the topic peaks.
    """
    pc_x = f"PC{a}"
    pc_y = f"PC{b}"
    return px.scatter(topic_df_vis.reset_index(), x=pc_x, y=pc_y,
                      text=topic_df_vis.index,
                      hover_name=hover_name,
                      size='MeanWeight',
                      color='title',
                      marginal_x='box',
                      height=800,
                      title=f"PCA of Topics (PC{a} vs PC{b})",
                      labels={'PC1': 'Principal Component 1',
                              'PC2': 'Principal Component 2',
                              'MeanWeight': 'Mean Topic Weight',
                              'title': 'Dominant Title'})
fig = vis_loadings(1, 2)
fig.show()
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Load the NRC sentiment lexicon, keyed by term.
salex_csv = 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\salex_nrc.csv'
SALEX = pd.read_csv(salex_csv).set_index('term_str')
# Strip the 'nrc_' prefix from every column name.
SALEX.columns = SALEX.columns.str.replace('nrc_', '', regex=False)
SALEX
| anger | anticipation | disgust | fear | joy | negative | positive | sadness | surprise | trust | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||
| abandon | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abandoned | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abandonment | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| abduction | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| aberration | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| young | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| youth | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| zeal | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| zealous | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
| zest | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
3688 rows × 11 columns
# Inner-align VOCAB (re-keyed on term_str) with the NRC lexicon: only terms
# present in both tables survive (1361 terms per the output below).
VOCAB_SENT = pd.concat([VOCAB.reset_index().set_index('term_str'), SALEX], join='inner', axis=1)
VOCAB_SENT
| n | n_chars | p | i | max_pos | max_pos_group | n_pos | cat_pos | stop | stem_porter | ... | anticipation | disgust | fear | joy | negative | positive | sadness | surprise | trust | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||||
| abandon | 3 | 7 | 0.000017 | 15.844044 | VB | VB | 1 | {VB} | 0 | abandon | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abandoned | 6 | 9 | 0.000034 | 14.844044 | VBD | VB | 3 | {JJ, VBN, VBD} | 0 | abandon | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| absence | 1 | 7 | 0.000006 | 17.429006 | JJ | JJ | 1 | {JJ} | 0 | absenc | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| absent | 1 | 6 | 0.000006 | 17.429006 | NN | NN | 1 | {NN} | 0 | absent | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abuse | 1 | 5 | 0.000006 | 17.429006 | NN | NN | 1 | {NN} | 0 | abus | ... | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| wound | 5 | 5 | 0.000028 | 15.107078 | NN | NN | 1 | {NN} | 0 | wound | ... | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| wreck | 1 | 5 | 0.000006 | 17.429006 | NN | NN | 1 | {NN} | 0 | wreck | ... | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| yell | 3 | 4 | 0.000017 | 15.844044 | VBP | VB | 3 | {VB, NN, VBP} | 0 | yell | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | -1 |
| young | 88 | 5 | 0.000499 | 10.969575 | JJ | JJ | 5 | {JJ, NN, NNP, VB, VBP} | 0 | young | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| youth | 4 | 5 | 0.000023 | 15.429006 | NN | NN | 2 | {NN, NNP} | 0 | youth | ... | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
1361 rows × 26 columns
# Emotion / polarity columns carried down to the token level
# (the raw 'negative'/'positive' flags are deliberately excluded).
emo_cols = "anger anticipation disgust fear joy sadness surprise trust sentiment".split()
# Attach sentiment features to each bag-of-words token by term; tokens
# without a lexicon entry are dropped.
sent_features = VOCAB_SENT[['max_pos'] + emo_cols]
BOW_SENT = BOW.join(sent_features, on='term_str', rsuffix='_v').dropna()
BOW_SENT
| n | max_pos | anger | anticipation | disgust | fear | joy | sadness | surprise | trust | sentiment | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | scene_id | para_num | sent_num | term_str | |||||||||||
| 1 | 1 | 0 | 0 | black | 1 | JJ | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | -1.0 |
| white | 1 | JJ | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | ||||
| 1 | 5 | addresses | 1 | VBZ | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ||
| 4 | 2 | authentic | 1 | JJ | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | ||
| fabrication | 1 | NNP | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -1.0 | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7 | 118 | 125 | 0 | dance | 1 | NN | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 132 | 0 | music | 1 | NN | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | ||
| 134 | 0 | cross | 1 | NNP | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | -1.0 | ||
| 1 | cross | 1 | NNP | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | -1.0 | |||
| 2 | dance | 2 | NN | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 |
8237 rows × 11 columns
# Script-level pivot: one row per script, one column per (emotion, term).
# CONSISTENCY FIX: the original repeated the emotion-column list verbatim;
# it is identical (names and order) to emo_cols defined above, so reuse it.
DOC_SENT = BOW_SENT.pivot_table(index='script_id', columns='term_str', values=emo_cols, fill_value=0)
DOC_SENT
| anger | ... | trust | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | abandon | abandoned | absence | absent | abuse | abyss | academic | accident | accidental | accompaniment | ... | worry | worrying | worse | worship | worthless | wound | wreck | yell | young | youth |
| script_id | |||||||||||||||||||||
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7 rows × 12249 columns
# Keep only the 'sentiment' polarity sub-frame (terms as columns, one row per script).
DOC_SENT_SENTIMENT = DOC_SENT['sentiment']
# Attach library metadata by script_id.
# NOTE(review): the output shows a 'title_y' column, so at least one term
# column collided with a LIB column name and was suffixed — verify downstream
# code does not expect a plain 'title' column here.
SENT_DF = pd.merge(DOC_SENT_SENTIMENT, LIB, left_index=True, right_on='script_id', how='inner')
SENT_DF
| abandon | abandoned | absence | absent | abuse | abyss | academic | accident | accidental | accompaniment | ... | yell | young | youth | title_y | years | era | source | scene_regex | movie_len | n_scenes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| script_id | |||||||||||||||||||||
| 1 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 0.0 | 1.0 | ... | 0.0 | 1.0 | 0.0 | Asteroid City | 2023 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... | 27624 | 62 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 0.0 | ... | -1.0 | 1.0 | 0.0 | Bottle Rocket | 1996 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|EXT/INT\.) | 19464 | 94 |
| 3 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 1.0 | French Dispatch | 2021 | late | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi... | 30868 | 182 |
| 4 | 0.0 | -1.0 | -1.0 | -1.0 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -1.0 | 1.0 | 0.0 | Grand Budapest Hotel | 2014 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) | 27185 | 177 |
| 5 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | Moonrise Kingdom | 2012 | middle | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|... | 24877 | 138 |
| 6 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | -1.0 | 1.0 | 0.0 | Royal Tennenbaums | 2001 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) | 24939 | 222 |
| 7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -1.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | Rushmore | 1998 | early | C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... | ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... | 21535 | 118 |
7 rows × 1368 columns
# Mean per-term sentiment by release year.
# FIX: select the 1361 term columns explicitly before apply() so the grouping
# column ('years') is excluded from the computation — this silences the pandas
# DeprecationWarning about DataFrameGroupBy.apply operating on grouping
# columns while producing the same result.
grouped_df = SENT_DF.groupby('years')
term_cols = SENT_DF.columns[0:1361].tolist()
sentiment_by_year = grouped_df[term_cols]\
    .apply(lambda x: x.apply(pd.to_numeric, errors='coerce').mean())\
    .reset_index()
fig = px.line(
    sentiment_by_year,
    x='years',
    # NOTE(review): this plots only the first term column — presumably a
    # placeholder; an aggregate over all terms may have been intended.
    y=sentiment_by_year.columns[1],
    title='Mean Sentiment by Year',
    labels={'years': 'Year'},
    markers=True
)
fig.show()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3179877830.py:3: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
# Sentence documents for word2vec: drop proper nouns (tags starting NNP/NNPS)
# and rows without a term, then collect each sentence's terms into a list.
docs = (
    CORPUS[~CORPUS.pos.str.match('NNPS?')]
    .dropna(subset=['term_str'])
    .groupby(SENT)['term_str']
    .apply(list)
    .reset_index()['term_str']
    .tolist()
)
# Keep only sentences with at least two tokens.
docs = [sentence for sentence in docs if len(sentence) > 1]
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px
import gensim
# Record the gensim version used for the word2vec model below.
gensim.__version__
'4.3.3'
# word2vec parameters
w2v_params = dict(
    window = 5,
    vector_size = 246,
    min_count = 50, # THIS LIMITS OUR VOCAB
    workers = 4
)
# DEAD-CODE FIX: the original built `vocab = Dictionary(docs)` here and
# immediately overwrote it below; Word2Vec builds its own vocabulary, so
# that step is removed.
# NOTE(review): workers=4 makes training order nondeterministic — results
# can vary slightly between runs; confirm that is acceptable.
model = word2vec.Word2Vec(docs, **w2v_params)
vocab = model.wv.index_to_key
word_vectors = model.wv.get_normed_vectors()
# Word-embedding table: one row per in-vocabulary term, one column per dimension.
W2V = pd.DataFrame(word_vectors, index=vocab)
W2V
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| the | 0.061662 | -0.114587 | -0.013411 | 0.049237 | -0.019735 | -0.048255 | 0.011538 | -0.044835 | -0.169090 | -0.063417 | ... | 0.012361 | 0.024137 | -0.008331 | -0.039298 | -0.005002 | -0.054072 | -0.043507 | -0.027745 | 0.083137 | 0.076604 |
| a | 0.045642 | -0.095899 | 0.000209 | 0.043699 | 0.002286 | -0.059206 | 0.012981 | -0.059315 | -0.185802 | -0.079814 | ... | 0.010301 | -0.000763 | 0.016885 | -0.034742 | 0.007390 | -0.057769 | -0.047956 | -0.042989 | 0.054543 | 0.066632 |
| and | 0.063052 | -0.129563 | -0.008656 | 0.043561 | 0.000264 | -0.060959 | -0.018276 | -0.054806 | -0.186765 | -0.064324 | ... | 0.002699 | 0.025620 | 0.002572 | -0.032011 | 0.002405 | -0.064659 | -0.030632 | -0.025463 | 0.082791 | 0.063598 |
| of | 0.041239 | -0.110470 | 0.003377 | 0.037904 | 0.006403 | -0.070319 | -0.015618 | -0.074511 | -0.188593 | -0.081859 | ... | -0.001532 | -0.017108 | 0.030993 | -0.043745 | 0.007583 | -0.073758 | -0.030773 | -0.032773 | 0.051215 | 0.073910 |
| to | -0.005363 | -0.045059 | 0.021593 | 0.013028 | 0.002145 | -0.048232 | -0.129133 | 0.006744 | -0.083075 | -0.009304 | ... | -0.048809 | 0.083383 | 0.021390 | 0.002768 | 0.041702 | -0.069906 | -0.009895 | -0.027302 | 0.030008 | 0.024399 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| every | 0.015675 | -0.073669 | 0.023438 | 0.022841 | -0.003873 | -0.057982 | -0.094052 | -0.020734 | -0.140234 | -0.046660 | ... | -0.033663 | 0.072350 | 0.013848 | -0.019384 | 0.028436 | -0.077645 | -0.008306 | -0.035662 | 0.051916 | 0.040544 |
| let | -0.027250 | -0.008303 | 0.034657 | 0.001060 | -0.000434 | -0.041714 | -0.118841 | 0.016291 | -0.061241 | -0.012690 | ... | -0.050286 | 0.084009 | 0.018705 | 0.005458 | 0.044029 | -0.064054 | 0.009572 | -0.034025 | 0.014033 | 0.011031 |
| ten | 0.019050 | -0.082811 | 0.020524 | 0.026385 | 0.000223 | -0.060145 | -0.085252 | -0.025562 | -0.146130 | -0.044765 | ... | -0.028616 | 0.069053 | 0.017981 | -0.018235 | 0.029832 | -0.076593 | -0.011033 | -0.034493 | 0.052572 | 0.043054 |
| chair | 0.038542 | -0.115938 | 0.007069 | 0.038383 | -0.002248 | -0.061542 | -0.050139 | -0.044249 | -0.179698 | -0.057000 | ... | -0.010139 | 0.044378 | 0.015401 | -0.029592 | 0.013941 | -0.073150 | -0.019933 | -0.030727 | 0.069193 | 0.053102 |
| etheline | 0.021201 | -0.087620 | 0.017109 | 0.027859 | -0.004503 | -0.058980 | -0.090568 | -0.028105 | -0.151700 | -0.045393 | ... | -0.033249 | 0.070590 | 0.016367 | -0.020575 | 0.026713 | -0.078587 | -0.010485 | -0.031704 | 0.057964 | 0.044489 |
330 rows × 246 columns
def get_vector(row):
    """Return the word2vec vector for the VOCAB row's term, or None if OOV.

    The term is the row's index label (row.name); terms filtered out by the
    model's min_count threshold are absent from model.wv and yield None.
    """
    try:
        return model.wv[row.name]
    except KeyError:  # FIX: drop the unused `as e` binding
        return None

# Vector table for every in-vocabulary VOCAB term (None rows dropped).
WV = pd.DataFrame(VOCAB.apply(get_vector, axis=1).dropna()).apply(lambda x: pd.Series(x[0]), axis=1)
# FIX: 'n_iter' was renamed 'max_iter' in scikit-learn 1.5 (the original call
# triggered the FutureWarning below); requires scikit-learn >= 1.5.
tsne_engine = tsne(perplexity=40, n_components=2, init='pca', max_iter=2500, random_state=23)
tsne_model = tsne_engine.fit_transform(WV.to_numpy())
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\sklearn\manifold\_t_sne.py:1164: FutureWarning: 'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.
# 2-D t-SNE coordinates per term, joined with vocabulary metadata for styling.
TSNE = pd.DataFrame(tsne_model, columns=['x', 'y'], index=WV.index)
X = TSNE.join(VOCAB, how='left')
# Term map: labeled points sized by DFIDF and colored by dominant POS tag.
px.scatter(
    X.reset_index(), x='x', y='y',
    text='term_str',
    color='max_pos',
    hover_name='term_str',
    size='dfidf',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont=dict(color='black', size=14, family='Arial'),
    textposition='top center',
)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Mean emotion scores per script and per (script, scene).
EMO_BOOKS = BOW_SENT.groupby(['script_id'])[emo_cols].mean()
EMO_CHAPS = BOW_SENT.groupby(['script_id','scene_id'])[emo_cols].mean()
# Relabel rows with movie titles.
# NOTE(review): this assigns LIB.title positionally — it assumes EMO_BOOKS
# rows (script_id 1..7) appear in the same order as LIB's rows; confirm.
EMO_BOOKS.index = LIB.title
EMO_BOOKS.plot.barh(figsize=(15,30));
plt.savefig('Vis1.png')
# Average script length (tokens) and scene count per era, with a heat gradient.
ERAS = LIB.groupby('era')[['movie_len', 'n_scenes']].agg('mean')
ERAS.style.background_gradient()
| movie_len | n_scenes | |
|---|---|---|
| era | ||
| early | 21979.333333 | 144.666667 |
| late | 29246.000000 | 122.000000 |
| middle | 26031.000000 | 157.500000 |
# Recompute term statistics from the document-term matrix:
# n2 = corpus frequency, p2 = relative frequency, i2 = self-information (bits),
# dp/di/dh = document probability, information, and entropy contribution.
# NOTE(review): the SettingWithCopyWarnings below indicate VOCAB is (or was)
# a slice of another DataFrame, so these column assignments may not propagate
# as intended — consider making VOCAB an explicit .copy() upstream; confirm.
VOCAB['n2'] = DTM.sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3643906384.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['p2'] = VOCAB.n2 / VOCAB.n2.sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2070894287.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['i2'] = -np.log2(VOCAB.p2)
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\208349894.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
VOCAB['dp'] = VOCAB.df / len(LIB)
VOCAB['di'] = np.log2(1/VOCAB.dp)
VOCAB['dh'] = VOCAB.dp * VOCAB.di
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Plotly setup for offline/notebook rendering.
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
# Term self-information (i2) vs DFIDF, colored by dominant POS tag.
px.scatter(VOCAB.reset_index(), x='i2', y='dfidf',
           hover_name='term_str', hover_data=['n'],
           color='max_pos',
           height=500, width=800)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Persist every derived table for the project as CSV.
# FIX: the original output_dir mixed separators ("GitHub\DS5001"); normalized
# to forward slashes (same target path on Windows, but consistent).
# IDIOM FIX: the 17 near-identical to_csv calls are table-driven now.
output_dir = "C:/Users/ddj6tu/Documents/GitHub/DS5001/Final_Project_ddj6tu/output"
data_prefix = "ANDERSON"
tables_to_save = {
    'LIB': LIB,
    'CORPUS': CORPUS,
    'VOCAB': VOCAB,
    'BOW': BOW,
    'DTM': DTM,
    'TFIDF': TFIDF,
    'TFIDF_RED': TFIDF_RED,
    'COMPS': COMPS,
    'DCM': DCM,
    'LOADINGS': LOADINGS,
    'TOPICS': TOPICS,
    'THETA': THETA,
    'PHI': PHI,
    'VOCAB_SENT': VOCAB_SENT,
    'BOW_SENT': BOW_SENT,
    'DOC_SENT': DOC_SENT,
    'W2V': W2V,
}
for table_name, table in tables_to_save.items():
    table.to_csv(f"{output_dir}/{data_prefix}-{table_name}.csv")